# =============================================================================
# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates
# SPDX-License-Identifier: GPL-3.0-or-later
# =============================================================================
# SCRIPT  : MAPLE_haplo_fraction.sh
# PROJECT : MAPLE (Methylation-Anchor Probe for Low Enrichment)
# PURPOSE : Quantify on-target and haplotype-specific fragment counts
#
# OVERVIEW:
#   This script processes stitched fragment BED files generated by MAPLE-Stitch,
#   identifies fragments matching predefined haplotype methylation patterns,
#   and calculates per-locus coverage and haplotype frequencies.
#
# INPUTS  :
#   - <sample>.ontarget.stitch.bed : On-target stitched fragments
#   - <pattern>.txt                : Haplotype methylation pattern definition table
#
# OUTPUTS :
#   - <sample>.on_target.coverage  : Per-locus coverage of on-target fragments
#   - <sample>.haplo.coverage      : Per-locus haplotype-specific fragment counts
#   - <sample>_tmp/                : Intermediate files for quality control
#
# USAGE   :
#   sbatch MAPLE_haplo_fraction.sh <sample_prefix> <stitch_pattern_file>
#
# AUTHOR  : Yangjunyi Li
# CREATED : 2023-08-01
# UPDATED : 2025-09-10
#
# NOTE    :
#   - Designed for use after MAPLE fragment stitching (MAPLE_stitch.py).
#   - Uses bedtools and awk for pattern-based fragment extraction.
#   - Final haplotype ratios are computed using MAPLE_cal_haplo_fraction.py.
# =============================================================================

set -euo pipefail

# ------------------- Input Argument Parsing --------------------
sample="$1"
stitch_pattern="$2"

if [[ ! -f "${sample}.ontarget.stitch.bed" ]]; then
    echo "[ERROR] Missing input file: ${sample}.ontarget.stitch.bed"
    exit 1
fi

if [[ ! -f "$stitch_pattern" ]]; then
    echo "[ERROR] Stitch pattern file not found: $stitch_pattern"
    exit 1
fi

echo "[INFO] Sample: $sample"
echo "[INFO] Stitch pattern: $stitch_pattern"

# ------------------- Prepare Uniq BED ---------------------------
echo "[STEP] Generating unique fragment BED file..."
bedtools intersect \
    -a <(bedtools sort -i ${sample}.ontarget.stitch.bed | cut -f1,2,3,8) \
    -b <(grep -v Chr "$stitch_pattern" | cut -f1-4 | uniq) \
    -wa -wb | sort | uniq -c > "${sample}.ontarget.stitch.sorted.uniq.bed"

mkdir -p "${sample}_tmp/"

# ------------------- Output Headers -----------------------------
echo -e "Chromosome\tStart\tEnd\tstitch_pattern_OT\tstitch_pattern_OB\tTarget_counts" > "${sample}.on_target.coverage"
echo -e "Chromosome\tStart\tEnd\tstitch_pattern_OT\tstitch_pattern_OB\tHaplo_counts"  > "${sample}.haplo.coverage"

# ------------------- Pattern Scanning ---------------------------
echo "[STEP] Scanning each haplotype pattern..."
cut -f1-4,6,7 "$stitch_pattern" | bedtools sort -i - | uniq | while read -r line; do
    # Parse fields
    coord=$(echo "$line" | awk '{print $1, $2, $3}')
    pattern=$(echo "$line" | awk '{print $4}')
    fname=$(echo "$pattern" | sed 's/:/_/g; s/-/_/g')

    patternot=$(echo "$line" | awk '{print $(NF-1)}')
    patternob=$(echo "$line" | awk '{print $NF}')

    # Convert to grep patterns
    seqot=$(echo "$patternot" | sed -e 's/:M,/\&\&/g' -e 's/:U,/\&\&/g' -e 's/:M//g' -e 's/:U//g')
    seqob=$(echo "$patternob" | sed -e 's/:M,/\&\&/g' -e 's/:U,/\&\&/g' -e 's/:M//g' -e 's/:U//g')
    filterot=$(echo "$patternot" | sed -e 's/:M,/:N -e /g' -e 's/:U,/:N -e /g' -e 's/:M/:N/g' -e 's/:U/:N/g')
    filterob=$(echo "$patternob" | sed -e 's/:M,/:N -e /g' -e 's/:U,/:N -e /g' -e 's/:M/:N/g' -e 's/:U/:N/g')

    # Filter BED lines matching pattern
    grep "$pattern" "${sample}.ontarget.stitch.sorted.uniq.bed" > "${sample}.${fname}.tmp.bed" || true

    # Extract full covered OT/OB fragments
    awk "/$seqot/" "${sample}.${fname}.tmp.bed" > "${sample}.${fname}.tmp.ontarget.bed" || true
    awk "/$seqob/" "${sample}.${fname}.tmp.bed" >> "${sample}.${fname}.tmp.ontarget.bed" || true

    # Count valid fragments (remove gap-like sites)
    cmdfilter=$(echo "grep -v -e $filterot -e $filterob")
    uniqcountstarget=$(eval "$cmdfilter" "${sample}.${fname}.tmp.ontarget.bed" | awk '{sum+=$1} END{print sum+0}')

    echo -e "${coord} ${patternot} ${patternob} ${uniqcountstarget}" | tr ' ' '\t' >> "${sample}.on_target.coverage"

    # Count haplotype-specific fragments
    uniqcountshaplo=$(grep -e "$patternot" -e "$patternob" "${sample}.${fname}.tmp.ontarget.bed" | awk '{sum+=$1} END{print sum+0}')
    echo -e "${coord} ${patternot} ${patternob} ${uniqcountshaplo}" | tr ' ' '\t' >> "${sample}.haplo.coverage"

    # Archive and clean up
    mv "${sample}.${fname}.tmp.ontarget.bed" "${sample}_tmp/"
    rm -f "${sample}.${fname}.tmp.bed"
done

# ------------------- Run Haplo Ratio Calculation ----------------
echo "[STEP] Calculating haplotype frequency ratios..."
python3 MAPLE_cal_haplo_fraction.py "$sample" "$stitch_pattern"

echo "[DONE] Haplotype fraction analysis complete."